{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Retrieving CAS registry numbers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import re\n",
    "import pubchempy as pcp"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Enable debug logging to make it easier to see what is going on:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import logging\n",
    "\n",
    "# setLevel() only lowers the logger's threshold; the DEBUG records still need\n",
    "# a handler to be displayed. basicConfig() attaches a stderr handler to the\n",
    "# root logger and does nothing if the root logger already has handlers.\n",
    "logging.basicConfig()\n",
    "logging.getLogger('pubchempy').setLevel(logging.DEBUG)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A function to get the CAS registry numbers for compounds with a particular SMILES substructure:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def get_substructure_cas(smiles, unique=False):\n",
    "    \"\"\"Return CAS registry numbers for compounds matching a SMILES substructure.\n",
    "\n",
    "    Runs a PubChem substructure search for ``smiles`` and scans each\n",
    "    synonym of every result for a leading CAS-style number: 2-7 digits, two\n",
    "    digits and a single final digit, separated by hyphens.\n",
    "\n",
    "    :param smiles: SMILES string describing the substructure to search for.\n",
    "    :param unique: If True, keep only the first occurrence of each number.\n",
    "        The default (False) returns every match, so a number listed under\n",
    "        several synonyms or compounds appears more than once.\n",
    "    :returns: List of CAS registry number strings, in the order encountered.\n",
    "    \"\"\"\n",
    "    # Raw string so the \\d escapes reach the regex engine untouched. The\n",
    "    # trailing (?!\\d) rejects lookalikes that carry extra digits after the\n",
    "    # final digit, such as the date '2015-06-12'.\n",
    "    cas_pattern = re.compile(r'(\\d{2,7}-\\d\\d-\\d)(?!\\d)')\n",
    "    cas_rns = []\n",
    "    seen = set()\n",
    "    results = pcp.get_synonyms(smiles, 'smiles', searchtype='substructure')\n",
    "    for result in results:\n",
    "        # Records without a 'Synonym' key are treated as having no synonyms.\n",
    "        for syn in result.get('Synonym', []):\n",
    "            match = cas_pattern.match(syn)\n",
    "            if not match:\n",
    "                continue\n",
    "            cas_rn = match.group(1)\n",
    "            if unique and cas_rn in seen:\n",
    "                continue\n",
    "            seen.add(cas_rn)\n",
    "            cas_rns.append(cas_rn)\n",
    "    return cas_rns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Test some inputs:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/substructure/smiles/JSON\n",
      "DEBUG:pubchempy:Request data: smiles=%5BPb%5D\n",
      "DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/3178699647975629202/synonyms/JSON\n",
      "DEBUG:pubchempy:Request data: None\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2174\n",
      "[u'7439-92-1', u'15875-18-0', u'54076-28-7', u'14701-27-0', u'15158-12-0', u'52229-97-7', u'724427-66-1', u'598-63-0', u'13427-42-4', u'17398-75-3']\n"
     ]
    }
   ],
   "source": [
    "# Substructure query for the bracket atom [Pb]; show the hit count and a sample.\n",
    "lead_cas_rns = get_substructure_cas('[Pb]')\n",
    "print(len(lead_cas_rns))\n",
    "print(lead_cas_rns[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/substructure/smiles/JSON\n",
      "DEBUG:pubchempy:Request data: smiles=%5BSe%5D\n",
      "DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/852509384123203131/synonyms/JSON\n",
      "DEBUG:pubchempy:Request data: None\n",
      "DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/852509384123203131/synonyms/JSON\n",
      "DEBUG:pubchempy:Request data: None\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "14577\n",
      "[u'10102-18-8', u'26970-82-1', u'15498-87-0', u'7782-82-3', u'14013-56-0', u'14013-56-0', u'29528-97-0', u'50647-14-8', u'7782-49-2', u'11125-23-8']\n"
     ]
    }
   ],
   "source": [
    "# Substructure query for the bracket atom [Se]; show the hit count and a sample.\n",
    "selenium_cas_rns = get_substructure_cas('[Se]')\n",
    "print(len(selenium_cas_rns))\n",
    "print(selenium_cas_rns[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/substructure/smiles/JSON\n",
      "DEBUG:pubchempy:Request data: smiles=%5BTi%5D\n",
      "DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/1812119792714669902/synonyms/JSON\n",
      "DEBUG:pubchempy:Request data: None\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1630\n",
      "[u'13463-67-7', u'1317-80-2', u'1317-70-0', u'98084-96-9', u'100292-32-8', u'101239-53-6', u'116788-85-3', u'12000-59-8', u'12036-20-3', u'12701-76-7']\n"
     ]
    }
   ],
   "source": [
    "# Substructure query for the bracket atom [Ti]; show the hit count and a sample.\n",
    "titanium_cas_rns = get_substructure_cas('[Ti]')\n",
    "print(len(titanium_cas_rns))\n",
    "print(titanium_cas_rns[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/substructure/smiles/JSON\n",
      "DEBUG:pubchempy:Request data: smiles=%5BPd%5D\n",
      "DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/2802290965277166497/synonyms/JSON\n",
      "DEBUG:pubchempy:Request data: None\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1401\n",
      "[u'7440-05-3', u'17637-99-9', u'53092-86-7', u'7647-10-1', u'10038-97-8', u'10102-05-3', u'14846-30-1', u'884739-77-9', u'3375-31-3', u'19807-27-3']\n"
     ]
    }
   ],
   "source": [
    "# Substructure query for the bracket atom [Pd]; show the hit count and a sample.\n",
    "palladium_cas_rns = get_substructure_cas('[Pd]')\n",
    "print(len(palladium_cas_rns))\n",
    "print(palladium_cas_rns[:10])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We could potentially get a TimeoutError if there are too many results. In this case, it might be better to perform the substructure search and then get the synonyms separately:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/substructure/smiles/JSON\n",
      "DEBUG:pubchempy:Request data: smiles=%5BPd%5D\n",
      "DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/3838589667186536348/cids/JSON\n",
      "DEBUG:pubchempy:Request data: None\n"
     ]
    }
   ],
   "source": [
    "# Run only the substructure search and keep the matching CIDs.\n",
    "cids = pcp.get_cids(\n",
    "    '[Pd]',\n",
    "    'smiles',\n",
    "    searchtype='substructure'\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "Then you can do `pcp.get_synonyms(cids)` with the list of CIDs."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
